{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# XGBoost Parameter Tuning for Rent Listing Inquiries"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 第六步：再次cv寻找最佳的参数n_estimators"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "首先 import 必要的模块"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from xgboost import XGBClassifier\n",
    "import xgboost as xgb\n",
    "\n",
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "import math\n",
    "\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "from sklearn.metrics import log_loss\n",
    "\n",
    "from matplotlib import pyplot\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Folder the train/test CSV files are read from\n",
    "dpath = './data/'\n",
    "train_csv = dpath + \"RentListingInquries_FE_train.csv\"\n",
    "test_csv = dpath + \"RentListingInquries_FE_test.csv\"\n",
    "\n",
    "train = pd.read_csv(train_csv)\n",
    "test = pd.read_csv(test_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Split the target column off from the feature columns\n",
    "label_col = 'interest_level'\n",
    "Y = train[label_col]\n",
    "# Features are kept as a plain NumPy array (column names are dropped)\n",
    "X = np.array(train.drop([label_col], axis=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# No hold-out split is made here: the whole labelled set is used as training data\n",
    "X_train, y_train = X, Y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RangeIndex(start=0, stop=74659, step=1)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the row index of the test DataFrame (quick check of how many rows it has)\n",
    "test.index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "各类样本不均衡，交叉验证是采用StratifiedKFold，在每折采样时各类样本按比例采样"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stratified 5-fold splitter; shuffling uses a fixed seed so the folds are repeatable\n",
    "kfold = StratifiedKFold(\n",
    "    n_splits=5,\n",
    "    shuffle=True,\n",
    "    random_state=3,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用已经调整好的参数，再次调整弱学习器的数目"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Tune n_estimators with xgboost's built-in cross validation (xgb.cv), which\n",
    "# evaluates every boosting round in a single run; GridSearchCV can only compare\n",
    "# a finite list of candidate values.\n",
    "def modelfit(alg, X_train, y_train, cv_folds=None, early_stopping_rounds=10,\n",
    "             num_class=3, cv_result_path='1_nestimators.csv'):\n",
    "    \"\"\"Select n_estimators by cross validation, then refit `alg` on all the data.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    alg : xgboost sklearn-style classifier\n",
    "        Supplies the booster parameters. Its current n_estimators is the upper\n",
    "        bound on boosting rounds and is overwritten with the selected value.\n",
    "    X_train, y_train\n",
    "        Training features and labels.\n",
    "    cv_folds : optional\n",
    "        Passed to xgb.cv as `folds` (e.g. a StratifiedKFold instance).\n",
    "    early_stopping_rounds : int\n",
    "        Passed to xgb.cv as `early_stopping_rounds`.\n",
    "    num_class : int\n",
    "        Number of target classes handed to the native xgboost API.\n",
    "    cv_result_path : str\n",
    "        CSV file that receives the per-round cross-validation results.\n",
    "\n",
    "    Returns nothing; prints the selected n_estimators and the log loss\n",
    "    measured on the training data.\n",
    "    \"\"\"\n",
    "    xgb_param = alg.get_xgb_params()\n",
    "    xgb_param['num_class'] = num_class\n",
    "\n",
    "    # Call the native xgboost API here rather than the sklearn wrapper class.\n",
    "    xgtrain = xgb.DMatrix(X_train, label=y_train)\n",
    "\n",
    "    cvresult = xgb.cv(xgb_param, xgtrain,\n",
    "                      num_boost_round=alg.get_params()['n_estimators'],\n",
    "                      folds=cv_folds, metrics='mlogloss',\n",
    "                      early_stopping_rounds=early_stopping_rounds)\n",
    "\n",
    "    cvresult.to_csv(cv_result_path, index_label='n_estimators')\n",
    "\n",
    "    # The number of rows in the cv result is taken as the best n_estimators.\n",
    "    n_estimators = cvresult.shape[0]\n",
    "\n",
    "    # Refit on the full training set with the selected number of rounds.\n",
    "    # eval_metric is intentionally not passed to fit(): no eval_set is supplied,\n",
    "    # so the metric was never evaluated there, and recent xgboost releases no\n",
    "    # longer accept that argument.\n",
    "    alg.set_params(n_estimators=n_estimators)\n",
    "    alg.fit(X_train, y_train)\n",
    "    print(n_estimators)\n",
    "\n",
    "    # Log loss on the training data itself (not a held-out estimate).\n",
    "    train_predprob = alg.predict_proba(X_train)\n",
    "    logloss = log_loss(y_train, train_predprob)\n",
    "\n",
    "    # Print model report:\n",
    "    print (\"logloss of train :\" )\n",
    "    print(logloss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "250\n",
      "logloss of train :\n",
      "0.47813485771025255\n"
     ]
    }
   ],
   "source": [
    "# Parameter values for the classifier; gamma is not set, so the library default applies.\n",
    "xgb1_params = dict(\n",
    "    learning_rate=0.1,\n",
    "    n_estimators=1000,  # a large value is fine: cv returns a suitable n_estimators\n",
    "    max_depth=6,\n",
    "    min_child_weight=7,\n",
    "    reg_alpha=0.1,\n",
    "    reg_lambda=0.5,\n",
    "    subsample=0.8,\n",
    "    colsample_bytree=0.8,\n",
    "    colsample_bylevel=0.7,\n",
    "    objective='multi:softprob',\n",
    "    nthread=-1,\n",
    "    seed=3,\n",
    ")\n",
    "xgb1 = XGBClassifier(**xgb1_params)\n",
    "\n",
    "# Cross-validate the number of trees on the stratified folds, then refit xgb1\n",
    "modelfit(xgb1, X_train, y_train, cv_folds=kfold)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 选择最佳 n_estimators=250"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([9.7049130e-03, 9.8839710e-03, 7.4273027e-02, 6.9259420e-02,\n",
       "       9.0352386e-02, 1.1925226e-02, 1.1459677e-02, 0.0000000e+00,\n",
       "       5.8730841e-03, 4.3117031e-02, 2.1307835e-02, 5.4827388e-02,\n",
       "       4.5731269e-02, 8.4156999e-03, 3.2230339e-03, 6.1595761e-03,\n",
       "       1.8263859e-03, 7.7710929e-03, 6.2670102e-03, 6.0521415e-03,\n",
       "       3.3662799e-03, 3.9034523e-03, 4.6053573e-02, 1.1312849e-01,\n",
       "       4.4513680e-02, 4.3940697e-02, 5.0171897e-02, 1.0743446e-04,\n",
       "       3.5811488e-05, 1.4324595e-04, 7.1622977e-05, 7.1622977e-05,\n",
       "       2.1128778e-03, 8.5947569e-04, 3.5811488e-05, 9.9197822e-03,\n",
       "       3.9392637e-04, 2.5068043e-04, 1.1459676e-03, 1.7905745e-04,\n",
       "       0.0000000e+00, 7.1622977e-05, 7.1622977e-05, 2.5426156e-03,\n",
       "       0.0000000e+00, 1.0743446e-04, 0.0000000e+00, 1.0743446e-04,\n",
       "       0.0000000e+00, 3.5811489e-04, 2.1486892e-04, 0.0000000e+00,\n",
       "       3.9392637e-04, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,\n",
       "       0.0000000e+00, 8.1650196e-03, 0.0000000e+00, 0.0000000e+00,\n",
       "       3.9034523e-03, 1.6115169e-03, 9.6691016e-04, 4.5838705e-03,\n",
       "       4.6554935e-04, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,\n",
       "       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 5.3717231e-04,\n",
       "       3.5811488e-05, 0.0000000e+00, 1.8263859e-03, 1.7905745e-04,\n",
       "       3.5811488e-05, 1.5040825e-03, 0.0000000e+00, 2.3277467e-03,\n",
       "       0.0000000e+00, 0.0000000e+00, 2.5426156e-03, 7.7352813e-03,\n",
       "       3.4020913e-03, 5.8014612e-03, 0.0000000e+00, 4.2973785e-04,\n",
       "       2.5068043e-04, 1.4324595e-04, 3.5811488e-05, 5.8730841e-03,\n",
       "       0.0000000e+00, 4.0108869e-03, 3.5811488e-05, 0.0000000e+00,\n",
       "       1.8192235e-02, 6.4460677e-04, 2.2203124e-03, 0.0000000e+00,\n",
       "       2.8649191e-04, 7.4487897e-03, 0.0000000e+00, 0.0000000e+00,\n",
       "       0.0000000e+00, 0.0000000e+00, 8.5947569e-03, 5.3717231e-04,\n",
       "       1.1101562e-03, 3.2230339e-04, 6.4460677e-04, 0.0000000e+00,\n",
       "       1.7905745e-04, 5.0852313e-03, 0.0000000e+00, 0.0000000e+00,\n",
       "       3.1872224e-03, 2.5068043e-04, 7.1622977e-05, 0.0000000e+00,\n",
       "       0.0000000e+00, 0.0000000e+00, 1.7905745e-04, 0.0000000e+00,\n",
       "       0.0000000e+00, 1.9338203e-03, 5.3717231e-04, 3.5811489e-04,\n",
       "       1.1961037e-02, 5.3717231e-04, 3.2230339e-04, 6.4460677e-04,\n",
       "       1.0743446e-04, 2.1486892e-04, 0.0000000e+00, 2.4709927e-03,\n",
       "       3.5811488e-05, 4.2973785e-04, 7.8785274e-04, 3.2230339e-04,\n",
       "       0.0000000e+00, 1.7905745e-04, 2.5068043e-04, 0.0000000e+00,\n",
       "       0.0000000e+00, 0.0000000e+00, 5.0136086e-04, 0.0000000e+00,\n",
       "       1.4324596e-03, 1.7905745e-04, 2.8649191e-04, 4.0108869e-03,\n",
       "       0.0000000e+00, 0.0000000e+00, 7.1622978e-04, 1.6473285e-03,\n",
       "       0.0000000e+00, 5.0136086e-04, 2.8649191e-04, 1.4324595e-04,\n",
       "       1.8980089e-03, 3.5811488e-05, 5.1926658e-03, 1.9338203e-03,\n",
       "       2.2203124e-03, 1.4324595e-04, 0.0000000e+00, 7.1622977e-05,\n",
       "       2.4709927e-03, 1.2175906e-03, 1.7905745e-04, 1.7905745e-04,\n",
       "       0.0000000e+00, 1.4324596e-03, 1.0743446e-04, 7.1622977e-05,\n",
       "       3.2230339e-03, 1.0743446e-04, 7.1622977e-05, 3.5811488e-05,\n",
       "       3.5811488e-05, 0.0000000e+00, 0.0000000e+00, 3.2230339e-04,\n",
       "       5.1926658e-03, 8.2366425e-04, 0.0000000e+00, 0.0000000e+00,\n",
       "       5.7298383e-03, 1.7905745e-04, 7.8785274e-04, 3.5811488e-05,\n",
       "       7.1622977e-05, 1.4324595e-04, 2.1486892e-04, 0.0000000e+00,\n",
       "       6.4460677e-04, 1.4324595e-04, 0.0000000e+00, 3.5811488e-05,\n",
       "       9.6691016e-04, 3.5811488e-05, 1.5398940e-03, 0.0000000e+00,\n",
       "       3.5811489e-04, 2.0412549e-03, 7.1622977e-05, 0.0000000e+00,\n",
       "       1.4324595e-04, 0.0000000e+00, 0.0000000e+00, 5.4075345e-03,\n",
       "       3.5811488e-05, 0.0000000e+00, 0.0000000e+00, 4.2973785e-04,\n",
       "       1.7905745e-04, 7.1622977e-05, 2.1486892e-04, 0.0000000e+00,\n",
       "       3.1155995e-03, 3.9392637e-04, 0.0000000e+00, 1.3250251e-03,\n",
       "       7.1622977e-05, 6.4460677e-04, 1.4324595e-04], dtype=float32)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Importance scores of the fitted classifier, one entry per input feature\n",
    "xgb1.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "numpy.ndarray"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Confirm that the training features are held as a NumPy array\n",
    "type(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "numpy.ndarray"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rebind `test` to a NumPy array (the training features are converted with np.array too), then show its type\n",
    "test=np.array(test)\n",
    "type(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "range(0, 74659)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row positions 0..n-1 of the test set; the same expression fills the id column when predictions are saved\n",
    "range(0,test.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 在测试集上预测，并保存预测结果为：概率+id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Class-probability predictions for every test row\n",
    "preds = xgb1.predict_proba(test)\n",
    "\n",
    "# NOTE(review): column names assume class codes 0/1/2 mean high/medium/low,\n",
    "# the same mapping used when labels are decoded later in this notebook - confirm\n",
    "out_df = pd.DataFrame(preds, columns=[\"high\", \"medium\", \"low\"])\n",
    "# id is the 0-based row position within the test array\n",
    "out_df[\"id\"] = range(test.shape[0])\n",
    "out_df.to_csv(\"xgb_RentListingInqueries.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将训练好的模型导出，方便后面使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Save the fitted classifier to disk so it can be reloaded without retraining\n",
    "filename = 'finalized_model.sav'\n",
    "# The context manager closes the file handle as soon as the dump has finished\n",
    "with open(filename, 'wb') as model_file:\n",
    "    pickle.dump(xgb1, model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Load the pickled classifier back from disk\n",
    "# NOTE: unpickling can execute code stored in the file, so only load files you produced yourself\n",
    "filename = 'finalized_model.sav'\n",
    "# The context manager closes the file handle once the model has been read\n",
    "with open(filename, 'rb') as model_file:\n",
    "    loaded_model = pickle.load(model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    }
   ],
   "source": [
    "# Lookup from the integer class codes returned by predict() to label names\n",
    "y_map = {0: 'high', 1: 'medium', 2: 'low'}\n",
    "\n",
    "class_codes = loaded_model.predict(test)\n",
    "preds = [y_map[code] for code in class_codes]\n",
    "\n",
    "# One row per test sample: predicted label plus its 0-based row position as id\n",
    "out_df = pd.DataFrame(preds, columns=[\"preds\"])\n",
    "out_df[\"id\"] = range(test.shape[0])\n",
    "out_df.to_csv(\"xgb_result_RentListingInqueries.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
